# Replication File for Seljan and Gronke, 2022, "Happy Birthday You Get
# To Vote". 
#
# CALIFORNIA: Anonymize voter files for distribution 
#
# This file takes as input a May 2019 voter list file, obtainable
# from the Division of Elections, State of California. This file represents
# the registration system after AVR was implemented in 2018 and after 
# one midterm election.
# 
# We have not
# distributed the raw data file because it contains personally identifiable
# information for millions of individual registrants. 
#
# The output files contain all data elements necessary to replicate the 
# full analysis.

library(data.table)
library(lubridate)
library(mosaic)
library(stringr)
library(tidyverse)
library(wru)
library(gender)
library(genderdata)
library(fastDummies)


# California Voter File --------------------------------------------------

# This script creates the file CA2018_anon.rds from the original voter file.

# Reading data


# Load only the registrant-level fields used below from the raw voter
# registration extract.
voter_columns <- c(
  "CountyCode", "RegistrantID", "LastName", "FirstName", "DOB", "PartyCode",
  "Status", "RegistrationDate", "RegistrationMethodCode",
  "VoterStatusReasonCodeDesc", "VbmVoterType", "EmsVoterId"
)
CA_file <- fread(
  "831-37089-59-pvrdr-vrd-20190528-1007.TXT",
  header = TRUE,
  select = voter_columns
)
# 20043128 observations

# Parse birth and registration dates with lubridate, then derive the calendar
# year and day-of-year of each. birthday_string holds the first five
# characters of the raw DOB field.
CA_file <- CA_file %>%
  mutate(
    lub_birthdate = ymd(DOB),
    birth_year = year(lub_birthdate),
    birth_day_of_year = yday(lub_birthdate),
    lub_regdate = ymd(RegistrationDate),
    reg_year = year(lub_regdate),
    reg_day_of_year = yday(lub_regdate),
    birthday_string = str_sub(DOB, 1, 5)
  )

# Registration year variables: one 0/1 indicator per registration year from
# 2019 back to 2014 (NA wherever reg_year itself is missing).

CA_file <- CA_file %>%
  mutate(
    regyear2019 = ifelse(reg_year == 2019, 1, 0),
    regyear2018 = ifelse(reg_year == 2018, 1, 0),
    regyear2017 = ifelse(reg_year == 2017, 1, 0),
    regyear2016 = ifelse(reg_year == 2016, 1, 0),
    regyear2015 = ifelse(reg_year == 2015, 1, 0),
    regyear2014 = ifelse(reg_year == 2014, 1, 0)
  )

# California closes voter registration 15 days before Election Day.
# Both constants are day-of-year numbers within 2018: the November general
# election (6 November) and its registration deadline (22 October).
electionday2018 <- yday(ymd("2018-11-06"))
deadlineNov2018 <- yday(ymd("2018-10-22"))

# Read and merge voter history
#
# Builds one 0/1 turnout indicator per registrant for each of the four
# November general elections (2012, 2014, 2016, 2018) and attaches them to
# CA_file. Registrants with no matching history row get 0.
#
# Review notes on what changed here:
#   * The previous date filter contained ymd(1400603), a typo that parses to
#     NA, so that clause never matched anything.
#   * Primary-election columns were created only to be dropped again through
#     a hard-coded select(-...) list that errors if any listed column is
#     absent. Only the four general-election slugs are used downstream, so
#     the history is now filtered to exactly those slugs.
#   * The join key is stated explicitly instead of being inferred.

CA_history <- fread(
  "831-37089-59-pvrdr-vph-20190528-1007.TXT",
  header = TRUE,
  select = c("RegistrantID", "ElectionType", "ElectionDate")
)

# Output column name = "<ElectionType>_<ElectionDate>" slug in the history file
general_elections <- c(
  vote_yes2012 = "PG_2012-11-06",
  vote_yes2014 = "GG_2014-11-04",
  vote_yes2016 = "PG_2016-11-08",
  vote_yes2018 = "GG_2018-11-06"
)

CA_history <- CA_history %>%
  unite(col = "ElectionSlug", ElectionType, ElectionDate, sep = "_", remove = TRUE) %>%
  filter(ElectionSlug %in% general_elections) %>%
  # a registrant may appear more than once for the same election
  distinct() %>%
  mutate(voted = 1) %>%
  pivot_wider(
    names_from = ElectionSlug,
    values_from = voted,
    values_fill = list(voted = 0)
  ) %>%
  # errors if one of the four elections is missing from the history file
  rename(all_of(general_elections))

CA_file <- CA_file %>%
  left_join(CA_history, by = "RegistrantID") %>%
  # registrants absent from the history file did not vote in any of the four
  mutate(across(all_of(names(general_elections)), ~ replace_na(.x, 0)))

# Compute age at the November 2018 General Election (6 November 2018).
#
# Age is (2018 - birth year), minus one when the birthday falls after
# Election Day. The comparison uses calendar month/day rather than
# day-of-year, so leap birth years need no special handling.
#
# Review note: the previous day-of-year version added 1 to leap-year birth
# dates before comparing with Election Day's day-of-year, which is the wrong
# direction (dates after 29 February already have a day-of-year one larger
# in a leap year). Registrants born on 5 or 6 November of a leap year, e.g.
# 2000, were therefore computed as one year too young and removed by the
# 18+ filter below.
election_month_day_2018 <- 1106  # 6 November encoded as month * 100 + day

CA_file <- CA_file %>%
  mutate(
    age_at_2018election = 2018 - birth_year -
      as.numeric(
        month(lub_birthdate) * 100 + day(lub_birthdate) > election_month_day_2018
      )
  )


# Limit to those aged 18 plus at time of election (rows with an unparseable
# birth date have NA age and are dropped here as well)
CA_file <- CA_file %>% filter(age_at_2018election >= 18)


# Create populous county dummy (Defined as Counties over 150K)
#
# The 25 county codes listed here are coded 0; every other value of
# CountyCode, including a missing one, is coded 1.

small_county_codes <- c(
  2, 46, 25, 53, 26, 22, 14, 32, 6, 8, 11, 18, 3,
  47, 5, 55, 35, 52, 17, 58, 23, 51, 29, 12, 28
)

CA_file <- CA_file %>%
  mutate(populous_county = if_else(CountyCode %in% small_county_codes, 0, 1))


# Impute Race Variables

# Copy LastName into a surname column and replace the single-byte N-tilde
# (0xD1) with a plain "N". str_replace_all is used so that names containing
# more than one such character are fully converted; str_replace would only
# touch the first occurrence.
CA_file$surname <- CA_file$LastName
CA_file$surname <- str_replace_all(CA_file$surname, "\xd1", "N")

# Crosswalk from the voter file's CountyCode (1-58, alphabetical by county
# name) to three-digit county FIPS codes, which are the odd numbers
# 001 through 115 in the same order.
ca_county_names <- c(
  "ALAMEDA", "ALPINE", "AMADOR", "BUTTE", "CALAVERAS", "COLUSA",
  "CONTRA COSTA", "DEL NORTE", "EL DORADO", "FRESNO", "GLENN", "HUMBOLDT",
  "IMPERIAL", "INYO", "KERN", "KINGS", "LAKE", "LASSEN", "LOS ANGELES",
  "MADERA", "MARIN", "MARIPOSA", "MENDOCINO", "MERCED", "MODOC", "MONO",
  "MONTEREY", "NAPA", "NEVADA", "ORANGE", "PLACER", "PLUMAS", "RIVERSIDE",
  "SACRAMENTO", "SAN BENITO", "SAN BERNARDINO", "SAN DIEGO", "SAN FRANCISCO",
  "SAN JOAQUIN", "SAN LUIS OBISPO", "SAN MATEO", "SANTA BARBARA",
  "SANTA CLARA", "SANTA CRUZ", "SHASTA", "SIERRA", "SISKIYOU", "SOLANO",
  "SONOMA", "STANISLAUS", "SUTTER", "TEHAMA", "TRINITY", "TULARE",
  "TUOLUMNE", "VENTURA", "YOLO", "YUBA"
)

CA_fips <- tibble::tibble(
  county = sprintf("%03d", seq(1L, 115L, by = 2L)),
  county_name = ca_county_names,
  state = "CA",
  CountyCode = seq_along(ca_county_names)
)

# Attach county FIPS, county name and state to every registrant
CA_file <- CA_file %>% left_join(CA_fips, by = "CountyCode")

# Census API key used by wru. Set the CENSUS_API_KEY environment variable to
# supply your own key; the literal below is kept only as a fallback so the
# script behaves as before when the variable is unset.
# NOTE(review): a credential is committed in source here (it previously
# appeared twice). It should be rotated and this fallback removed before the
# file is redistributed.
census_key <- Sys.getenv(
  "CENSUS_API_KEY",
  unset = "ae147f31d2e5acfa2a6f8cafc8befe65dfd6d462"
)

# Download county-level census data for California, then predict race
# probabilities for each registrant with wru::predict_race (which adds the
# pred.whi / pred.bla / pred.his / pred.asi / pred.oth columns used below).
CA_census_race <- get_census_data(
  key = census_key, states = "CA", age = FALSE, sex = FALSE,
  census.geo = "county", retry = 0
)
CA_file <- predict_race(
  voter.file = CA_file, census.data = CA_census_race,
  census.geo = "county", census.key = census_key
)

# One 0/1 indicator per group, set to 1 only when that group's predicted
# probability is strictly greater than every other group's. Exact ties
# therefore leave all five indicators at 0.
CA_file <- CA_file %>%
  mutate(
    white = ifelse(
      pred.whi > pred.bla & pred.whi > pred.his &
        pred.whi > pred.asi & pred.whi > pred.oth, 1, 0
    ),
    black = ifelse(
      pred.bla > pred.whi & pred.bla > pred.his &
        pred.bla > pred.asi & pred.bla > pred.oth, 1, 0
    ),
    hispanic = ifelse(
      pred.his > pred.bla & pred.his > pred.whi &
        pred.his > pred.asi & pred.his > pred.oth, 1, 0
    ),
    other_race = ifelse(
      pred.oth > pred.bla & pred.oth > pred.whi &
        pred.oth > pred.asi & pred.oth > pred.his, 1, 0
    ),
    asian = ifelse(
      pred.asi > pred.bla & pred.asi > pred.whi &
        pred.asi > pred.his & pred.asi > pred.oth, 1, 0
    )
  )


# Predict gender
#
# Uses the SSA baby-name data (via the gender / genderdata packages) to
# classify each FirstName x birth_year pair, then merges 0/1 female / male
# indicators back onto CA_file. Registrants whose name/year pair receives no
# prediction get nogender = 1 and female = male = 0.
#
# Review note: the script previously removed CA_file_g with rm() and then
# passed it to gender_df(), which stops with "object 'CA_file_g' not found".
# That call also overwrote the gender table just produced by a group-wise
# do() pipeline. The prediction is now done once, with gender_df(), before
# the temporaries are removed.

# Name/year pairs present in the SSA data, upper-cased to match the voter file
ssa <- genderdata::ssa_national %>%
  select(name, year) %>%
  rename(
    FirstName = name,
    birth_year = year
  ) %>%
  mutate(
    ssa_exist = 1,
    FirstName = toupper(FirstName)
  )

# Replace every single-byte N-tilde (0xD1) with "N" before matching
CA_file$FirstName <- str_replace_all(CA_file$FirstName, "\xd1", "N")

# Only registrants whose name/year pair exists in the SSA data are sent on
CA_file_g <- semi_join(CA_file, ssa, by = c("FirstName", "birth_year"))

gender <- gender_df(CA_file_g, name_col = "FirstName", year_col = "birth_year")

rm(ssa, CA_file_g)

# Names with proportion_female >= 0.5 are coded female, otherwise male;
# columns are renamed to the join keys used by CA_file
gender <- gender %>%
  ungroup() %>%
  mutate(
    female = ifelse(proportion_female >= 0.5, 1, 0),
    male = ifelse(proportion_female < 0.5, 1, 0),
    birth_year = year_min,
    FirstName = name
  ) %>%
  distinct(birth_year, FirstName, female, male)

CA_file <- left_join(CA_file, gender, by = c("FirstName", "birth_year"))

# nogender must be computed before the NAs in female are overwritten
CA_file$nogender <- ifelse(is.na(CA_file$female), 1, 0)
CA_file$female <- replace_na(CA_file$female, 0)
CA_file$male <- replace_na(CA_file$male, 0)


# Limit data so anonymous by individual attributes
#
# Names, registrant IDs and county codes are not carried into the output;
# only the columns listed here are kept, in this order.
# NOTE(review): DOB and RegistrationDate are retained in full - confirm this
# meets the intended anonymization standard.

anon_columns <- c(
  "DOB", "PartyCode", "Status", "RegistrationDate", "RegistrationMethodCode",
  "VoterStatusReasonCodeDesc",
  "vote_yes2012", "vote_yes2014", "vote_yes2018", "vote_yes2016",
  "populous_county", "white", "black", "hispanic", "other_race",
  "asian", "nogender", "female", "male"
)

CA_file <- CA_file %>% select(all_of(anon_columns))

saveRDS(CA_file, "CA2018_anon.rds")



